import pandas as pd
from numpy import nan

# Input workbooks. Further down the script, "tag"/"Topic category" are read
# from media_cloud600, "Article"/"LABEL"/"Topic 1" from ner, and
# "Article" plus "Topic 1".."Topic 20" from media_cloud_tagger.
media_cloud600 = pd.read_excel("Media Cloud 600 Topics.xlsx")
ner = pd.read_excel("sentence_to_wiki.xlsx")
media_cloud_tagger = pd.read_excel("US News Sample - Media Cloud Tagger_above 100 words.xlsx")

# Module-level containers that the rest of the script fills in.
category_list = []       # parsed lines of categorys.txt
replace_dict = {}        # category name -> replacement category name
tag2category_dict = {}   # tag -> topic category
D = {}                   # column name -> list of per-article values

# "Topic 1" .. "Topic 20": the topic columns of media_cloud_tagger.
replace_columns = ["Topic {}".format(i) for i in range(1, 21)]

def topic_category(cate):
    """Return the column label for category ``cate``: ``"TOPIC_"`` + ``cate``."""
    prefix = "TOPIC_"
    return prefix + cate


def fix_topic_category(category):
    """Return the replacement for ``category``, or ``category`` itself.

    The lookup uses the module-level ``replace_dict``. A value with no entry
    there is returned unchanged.
    """
    return replace_dict.get(category, category)

def article_topic_invert(row_data):
    """Append one article's per-category topic positions to the global ``D``.

    ``row_data`` is one row: its "Article" entry identifies the article and
    the values at positions 1-3 are treated as that article's topics. For
    every name in ``category_lst`` plus "Unknown", ``D[name]`` receives the
    1-based position of the name's first occurrence among those three values,
    or 0 when it does not occur. The article is appended to ``D["Article"]``.

    Returns None; the function is called only for its side effects on ``D``.

    NOTE(review): only positions 1-3 are read even though the caller passes a
    row with 20 topic columns -- confirm that limiting to three is intended.
    """
    article = row_data["Article"]
    top_topics = list(row_data[1:4])
    present = set(top_topics)
    for name in category_lst + ["Unknown"]:
        position = top_topics.index(name) + 1 if name in present else 0
        D[name].append(position)
    D["Article"].append(article)


# Parse categorys.txt: every line is split on commas into a list of fields.
with open("categorys.txt") as f:
    category_list = [line.strip('\n').split(',') for line in f]

# A line with exactly two fields maps its second field to its first field.
# Lines with any other number of fields contribute no replacement.
replace_dict.update(
    (fields[1], fields[0]) for fields in category_list if len(fields) == 2
)

# Normalise the category names, then blank out the numeric placeholder 99.
# The result is assigned back to the column instead of calling
# ``replace(..., inplace=True)`` on a column selection: that chained in-place
# form does not modify the DataFrame under pandas Copy-on-Write.
media_cloud600["Topic category"] = (
    media_cloud600["Topic category"].apply(fix_topic_category).replace(99, '')
)

# tag -> topic category. When a tag occurs more than once the last row wins,
# as it would when assigning row by row.
tag2category_dict.update(
    zip(media_cloud600["tag"], media_cloud600["Topic category"])
)

# Combined key "<LABEL>_<Topic 1>" for every row of the NER sheet.
ner["Lable And Topic 1"] = ner["LABEL"] + '_' + ner["Topic 1"]
ner_keys = ["Article", "Lable And Topic 1"]
bi_ner = ner[ner_keys]

# Number of rows per (article, combined key) pair.
pair_counts = bi_ner.groupby(ner_keys).size()
count_ner = pair_counts.reset_index()

# Wide layout: one row per article, one column per combined key. Pairs that
# never occur are NaN at this point.
stack_count_ner = pair_counts.unstack('Lable And Topic 1').reset_index()

# Attach the counts to every tagger row. The left join keeps articles that
# have no NER rows, and all remaining NaN values become 0.
result = media_cloud_tagger.merge(stack_count_ner, on="Article", how='left').fillna(0)
# "Article" plus the 20 topic columns of the tagger sheet.
cloud_article_topics = media_cloud_tagger[["Article"] + replace_columns]
category_lst = list(set(tag2category_dict.values()))
# NOTE(review): tag2category_dict is only used here to enumerate category
# names; the topic values are compared against those names directly. Confirm
# that the "Topic N" columns already hold category names rather than tags.

# One accumulator list per category, then "Unknown" and "Article" last.
# The slice ``[:-2]`` below relies on these two keys being inserted last.
for category in category_lst:
    D[category] = []
D["Unknown"] = []
D["Article"] = []

# article_topic_invert is called only for its side effects on D, so iterate
# explicitly instead of using DataFrame.apply: pandas releases before 1.1
# call the applied function twice on the first row, which would record the
# first article twice.
for _, topic_row in cloud_article_topics.iterrows():
    article_topic_invert(topic_row)
df2 = pd.DataFrame(D)

# Prefix every category column with "TOPIC_".
# NOTE(review): "Unknown" is excluded by the slice and keeps its bare name --
# confirm that this is intended.
column_name = {column: topic_category(column) for column in df2.columns[:-2]}
df2 = df2.rename(columns=column_name)

# Inner join on "Article": per-category positions + tagger columns + NER counts.
merge_df = pd.merge(df2, result, on="Article")

# Column order: the original tagger columns first (in their original order),
# followed by every added column in sorted order.
tagger_columns = list(media_cloud_tagger.columns)
tagger_column_set = set(tagger_columns)
columns = tagger_columns + sorted(
    column for column in merge_df.columns if column not in tagger_column_set
)
merge_df = merge_df[columns]

# The unfiltered table used to be written to result.xlsx at this point, but
# the file is unconditionally overwritten by the final export below, so that
# intermediate write has been dropped.

# Treat 0 as "missing" so that empty columns and rows can be pruned.
merge_df = merge_df.replace(0, nan)
merge_df = merge_df.dropna(axis=1, how='all')
# Drop rows that have no value in any column from position 52 onwards.
# NOTE(review): 52 is a hard-coded offset that is evaluated after all-empty
# columns were removed -- presumably the first added column; verify it against
# the actual width of the tagger sheet.
merge_df = merge_df.dropna(subset=list(merge_df.columns[52:]), how='all')
df_list = list(merge_df.columns)

# Restore the zeros for the exported table; merge_df itself keeps the NaNs.
df = merge_df[df_list].replace(nan, 0)
df.to_excel('result.xlsx')